import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns
# Load the vehicle silhouette data set and report its dimensions.
vehicle = pd.read_csv('vehicle.csv')
vehicle.shape
# Derive the reported dimensions from the data itself instead of
# hard-coding "19 columns and 846 rows", so the message stays correct
# if the input file changes.
print(f'The vehicle data provided has {vehicle.shape[1]} columns and {vehicle.shape[0]} rows(observations)')
vehicle.head()
# True if any cell anywhere in the frame is NaN.
vehicle.isnull().values.any()
print('There seems null values in the data, which we have to treat')
As the number of observations with NA values is very small, we will drop every row that has an NA value in any column.
# Remove every row that contains at least one missing value; only a
# handful of rows are affected, so nearly all of the data is kept.
vehicle1 = vehicle.dropna(how='any')
vehicle1.info()
# Verify that no column still holds missing values after the drop.
vehicle_null2 = vehicle1.isnull().sum()
vehicle_null2
Now we have 813 rows with all acceptable values.
# Summary statistics of the cleaned data (count, mean, std, quartiles).
vehicle1.describe()
# Pairwise scatter plots with KDE diagonals, coloured by vehicle class,
# to inspect feature distributions and pairwise correlations.
sns.pairplot(vehicle1, diag_kind = 'kde', hue = 'class')
Observations from the pairplot: there are at least 3 Gaussians visible, which means there are at least 3 clusters in the data — consistent with the problem statement.
There is multicollinearity in the data: many variables are highly correlated with one another.
# Separate the features from the target label.
X = vehicle1.drop('class', axis=1)
y = vehicle1['class']

# Pearson correlation between all features, shown as a lower-triangle
# heatmap (the upper triangle is masked out because it is redundant).
corr_X = X.corr(method='pearson')
upper_triangle = np.zeros_like(corr_X)
upper_triangle[np.triu_indices_from(upper_triangle)] = True
sns.heatmap(corr_X, cmap='RdYlGn_r', vmax=1.0, vmin=-1.0,
            mask=upper_triangle, linewidths=2.5)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()
from scipy.stats import zscore

# Standardise every feature to zero mean / unit variance so that
# scale-sensitive methods downstream (SVM, PCA) are not dominated by
# features with large magnitudes.
XScale = X.apply(zscore)
XScale.describe()

# Drop the features the correlation heatmap showed to be highly
# collinear, keeping one representative per correlated group.
# (The block of per-column drop calls that used to follow was
# commented-out dead code duplicating this single call — removed.)
XScale = XScale.drop(['compactness', 'circularity', 'distance_circularity',
                      'scatter_ratio', 'elongatedness', 'pr.axis_rectangularity',
                      'max.length_rectangularity', 'scaled_variance',
                      'scaled_variance.1', 'scaled_radius_of_gyration',
                      'scaled_radius_of_gyration.1'], axis=1)
XScale.describe()
# Hold out 30% of the data for testing; fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    XScale, y, test_size=0.30, random_state=1)

# Fit a support-vector classifier with default hyper-parameters and
# score both splits to gauge over-fitting.
from sklearn import svm
clr = svm.SVC()
clr.fit(X_train, y_train)
y_pred = clr.predict(X_test)
clr.score(X_train, y_train)
clr.score(X_test, y_test)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 20-fold cross validation of the SVM on the full scaled/reduced data.
num_folds = 20
seed = 7
# shuffle=True is required whenever random_state is supplied; without
# it, scikit-learn (>= 0.24) raises a ValueError because the seed
# would have no effect on an unshuffled split.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = clr
results = cross_val_score(model, XScale, y, cv=kfold)
print(results)
# Mean and standard deviation of the per-fold accuracies, in percent.
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# generating the covariance matrix and the eigen values for the PCA analysis
from sklearn.decomposition import PCA

cov_matrix = np.cov(XScale.T)  # covariance matrix of the scaled features
# Bug fix: the original used a comma instead of the % operator, so
# print() received two arguments and emitted a literal '%s' before the
# matrix; %-format the string instead.
print('Covariance Matrix \n%s' % cov_matrix)

# Fit PCA with one component per remaining feature (7) so the full
# variance spectrum can be inspected before choosing a cut-off.
# (The commented-out manual np.linalg.eig computation was dead code
# duplicating what PCA already provides — removed.)
pca = PCA(n_components=7)
pca.fit(XScale)
We can see there are 7 principal components generated for the 7 independent variables.
# the "cumulative variance explained" analysis
# Eigenvalues: absolute variance carried by each principal component.
print(pca.explained_variance_)
# Eigenvectors: loadings of each original feature on each component.
print(pca.components_)
# Fraction of total variance explained by each component.
print(pca.explained_variance_ratio_)
# Plotting the variance explained by each principal component and the
# cumulative variance explained.
# Derive the component count from the fitted PCA instead of hard-coding
# range(1, 8), so the plots stay correct if n_components changes above.
component_index = list(range(1, len(pca.explained_variance_ratio_) + 1))
plt.bar(component_index, pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
# Step plot of the running total — used to pick how many components to keep.
plt.step(component_index, np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
# Keep the first 5 principal components (per the cumulative-variance
# plot above) and project the scaled data onto them.
pca5 = PCA(n_components=5)
pca5.fit(XScale)
print(pca5.components_)
Xpca5 = pca5.transform(XScale)
Xpca5

# Split X and y in to train and test set in the ratio 70:30
from sklearn.model_selection import train_test_split
Xpca_train, Xpca_test, ypca_train, ypca_test = train_test_split(
    Xpca5, y, test_size=0.30, random_state=1)

# Train a second SVM on the reduced representation and score both splits.
from sklearn import svm
clpca = svm.SVC()
clpca.fit(Xpca_train, ypca_train)
clpca.score(Xpca_train, ypca_train)
clpca.score(Xpca_test, ypca_test)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 20-fold cross validation of the SVM on the 5-component PCA projection.
num_folds = 20
seed = 7
# shuffle=True is required whenever random_state is supplied; without
# it, scikit-learn (>= 0.24) raises a ValueError.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = clpca
results_cv = cross_val_score(model, Xpca5, y, cv=kfold)
# Bug fix: the original printed `results` (the scores from the earlier,
# non-PCA cross validation) instead of the new `results_cv`.
print(results_cv)
print("Accuracy: %.3f%% (%.3f%%)" % (results_cv.mean()*100.0, results_cv.std()*100.0))
# Compare test-set accuracy of the two models side by side.
# (Fixed typos in the printed messages: 'origial' -> 'original',
# 'independant' -> 'independent'.)
print('Accuracy on test data with SVM with original independent data set:')
print(clr.score(X_test, y_test))
print('Accuracy on test data with SVM with reduced dimensions (5 Principal Components) independent data set:')
print(clpca.score(Xpca_test, ypca_test))
We can see that the model accuracy decreased when we used only selected principal components; this is because we dropped some components, so the variance captured by the retained ones is limited (about 95%).
# Compare the cross-validated accuracies of the two models.
# (Fixed typos in the printed messages: 'independant' -> 'independent',
# 'Validaion' -> 'Validation'.)
print('Accuracy on original data with SVM using K fold validation set:')
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
print('Accuracy on data with SVM with reduced dimensions (5 Principal Components) independent data set and using K fold Validation:')
print("Accuracy: %.3f%% (%.3f%%)" % (results_cv.mean()*100.0, results_cv.std()*100.0))
We can observe that the overall average accuracy increased when using K-fold validation. Here we have used 20 folds.
The model ran 20 iterations on our data and computed the accuracy for each fold. The average of these per-iteration accuracies is taken, and we can assume there is a high probability that our model will perform with similar accuracy in production as well.
It is also observable that with principal components the accuracy drops because of the reduced dimensionality. That is not a big issue if we want to work with just 5 independent variables instead of the 19 in the provided data set. So in this analysis we first reduced the number of independent variables by removing highly correlated ones, and then further by using Principal Component Analysis. This kind of reduction can help avoid the curse of dimensionality and overfitting, and reduce the cost of training and running the model.